# Load the Chennai reservoir levels and take a first look at the table.
water_level <- read.csv(file = "chennai_reservoir_levels.csv")
dim(water_level)
head(water_level)
| Date | POONDI | CHOLAVARAM | REDHILLS | CHEMBARAMBAKKAM | |
|---|---|---|---|---|---|
| <chr> | <dbl> | <dbl> | <dbl> | <dbl> | |
| 1 | 01-01-2004 | 3.9 | 0 | 268 | 0 |
| 2 | 02-01-2004 | 3.9 | 0 | 268 | 0 |
| 3 | 03-01-2004 | 3.9 | 0 | 267 | 0 |
| 4 | 04-01-2004 | 3.9 | 0 | 267 | 0 |
| 5 | 05-01-2004 | 3.8 | 0 | 267 | 0 |
| 6 | 06-01-2004 | 3.8 | 0 | 266 | 0 |
# Show the column types of the freshly loaded reservoir-level table.
str(water_level)
'data.frame': 6182 obs. of 5 variables: $ Date : chr "01-01-2004" "02-01-2004" "03-01-2004" "04-01-2004" ... $ POONDI : num 3.9 3.9 3.9 3.9 3.8 3.8 3.8 3.7 3.7 3.7 ... $ CHOLAVARAM : num 0 0 0 0 0 0 0 0 0 0 ... $ REDHILLS : num 268 268 267 267 267 266 266 265 264 264 ... $ CHEMBARAMBAKKAM: num 0 0 0 0 0 0 0 0 0 0 ...
# Per-column summary statistics of the reservoir levels.
summary(water_level)
Date POONDI CHOLAVARAM REDHILLS
Length:6182 Min. : 0.9 Min. : 0.0 Min. : 0.0
Class :character 1st Qu.: 202.0 1st Qu.: 26.0 1st Qu.: 847.2
Mode :character Median : 783.1 Median : 89.0 Median :1685.0
Mean :1106.7 Mean :226.4 Mean :1592.3
3rd Qu.:1918.0 3rd Qu.:411.0 3rd Qu.:2320.0
Max. :3231.0 Max. :896.0 Max. :3300.0
CHEMBARAMBAKKAM
Min. : 0.0
1st Qu.: 459.2
Median :1264.0
Mean :1321.5
3rd Qu.:2046.8
Max. :3396.0
# Parse the day-month-year strings into date-times. POSIXct is used instead of
# POSIXlt because it is a plain atomic vector, which is the safe date-time
# class for a data-frame column; the time zone is pinned so that parsing does
# not depend on the machine's locale settings.
water_level$Date <- as.POSIXct(water_level$Date, format = "%d-%m-%Y", tz = "UTC")
str(water_level)
'data.frame': 6182 obs. of 5 variables: $ Date : POSIXlt, format: "2004-01-01" "2004-01-02" ... $ POONDI : num 3.9 3.9 3.9 3.9 3.8 3.8 3.8 3.7 3.7 3.7 ... $ CHOLAVARAM : num 0 0 0 0 0 0 0 0 0 0 ... $ REDHILLS : num 268 268 267 267 267 266 266 265 264 264 ... $ CHEMBARAMBAKKAM: num 0 0 0 0 0 0 0 0 0 0 ...
# Is anything missing in the reservoir-level table, and how many cells?
any(is.na(water_level))
sum(is.na(water_level))
# List the reservoir columns, i.e. every column name except the first (Date).
print("Reservoirs in Chennai:")
res <- names(water_level)
res[-1]
[1] "Reservoirs in Chennai:"
# Load the rainfall table for the same reservoirs and preview it.
rainfall <- read.csv(file = "chennai_reservoir_rainfall.csv")
dim(rainfall)
head(rainfall)
| Date | POONDI | CHOLAVARAM | REDHILLS | CHEMBARAMBAKKAM | |
|---|---|---|---|---|---|
| <chr> | <dbl> | <dbl> | <dbl> | <dbl> | |
| 1 | 01-01-2004 | 0 | 0 | 0 | 0 |
| 2 | 02-01-2004 | 0 | 0 | 0 | 0 |
| 3 | 03-01-2004 | 0 | 0 | 0 | 0 |
| 4 | 04-01-2004 | 0 | 0 | 0 | 0 |
| 5 | 05-01-2004 | 0 | 0 | 0 | 0 |
| 6 | 06-01-2004 | 0 | 0 | 0 | 0 |
# Parse the day-month-year strings into date-times. POSIXct is used instead of
# POSIXlt because it is a plain atomic vector, which is the safe date-time
# class for a data-frame column; the time zone is pinned so that parsing does
# not depend on the machine's locale settings.
rainfall$Date <- as.POSIXct(rainfall$Date, format = "%d-%m-%Y", tz = "UTC")
str(rainfall)
'data.frame': 6182 obs. of 5 variables: $ Date : POSIXlt, format: "2004-01-01" "2004-01-02" ... $ POONDI : num 0 0 0 0 0 0 0 0 0 0 ... $ CHOLAVARAM : num 0 0 0 0 0 0 0 0 0 0 ... $ REDHILLS : num 0 0 0 0 0 0 0 0 0 0 ... $ CHEMBARAMBAKKAM: num 0 0 0 0 0 0 0 0 0 0 ...
# Per-column summary statistics of the rainfall table.
summary(rainfall)
Date POONDI CHOLAVARAM
Min. :2004-01-01 00:00:00 Min. : 0.000 Min. : 0.000
1st Qu.:2008-03-25 06:00:00 1st Qu.: 0.000 1st Qu.: 0.000
Median :2012-06-17 12:00:00 Median : 0.000 Median : 0.000
Mean :2012-06-17 12:00:00 Mean : 3.461 Mean : 3.728
3rd Qu.:2016-09-09 18:00:00 3rd Qu.: 0.000 3rd Qu.: 0.000
Max. :2020-12-03 00:00:00 Max. :300.000 Max. :293.000
REDHILLS CHEMBARAMBAKKAM
Min. : 0.000 Min. : 0.000
1st Qu.: 0.000 1st Qu.: 0.000
Median : 0.000 Median : 0.000
Mean : 3.833 Mean : 3.973
3rd Qu.: 0.000 3rd Qu.: 0.000
Max. :320.000 Max. :475.000
# Is anything missing in the rainfall table, and how many cells?
any(is.na(rainfall))
sum(is.na(rainfall))
# Load the water-quality table; the file is read as Latin-1 encoded text.
water_quality <- read.csv(
  file = "water_dataX.csv",
  fileEncoding = "Latin1"
)
dim(water_quality)
head(water_quality)
| STATION.CODE | LOCATIONS | STATE | Temp | D.O...mg.l. | PH | CONDUCTIVITY..µmhos.cm. | B.O.D...mg.l. | NITRATENAN.N..NITRITENANN..mg.l. | FECAL.COLIFORM..MPN.100ml. | TOTAL.COLIFORM..MPN.100ml.Mean | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| <chr> | <chr> | <chr> | <dbl> | <dbl> | <dbl> | <chr> | <chr> | <dbl> | <chr> | <chr> | <int> | |
| 1 | 1393 | DAMANGANGA AT D/S OF MADHUBAN, DAMAN | DAMAN & DIU | 30.6 | 6.7 | 7.5 | 203 | NAN | 0.1 | 11 | 27 | 2014 |
| 2 | 1399 | ZUARI AT D/S OF PT. WHERE KUMBARJRIA CANAL JOINS, GOA | GOA | 29.8 | 5.7 | 7.2 | 189 | 2 | 0.2 | 4953 | 8391 | 2014 |
| 3 | 1475 | ZUARI AT PANCHAWADI | GOA | 29.5 | 6.3 | 6.9 | 179 | 1.7 | 0.1 | 3243 | 5330 | 2014 |
| 4 | 3181 | RIVER ZUARI AT BORIM BRIDGE | GOA | 29.7 | 5.8 | 6.9 | 64 | 3.8 | 0.5 | 5382 | 8443 | 2014 |
| 5 | 3182 | RIVER ZUARI AT MARCAIM JETTY | GOA | 29.5 | 5.8 | 7.3 | 83 | 1.9 | 0.4 | 3428 | 5500 | 2014 |
| 6 | 1400 | MANDOVI AT NEGHBOURHOOD OF PANAJI, GOA | GOA | 30.0 | 5.5 | 7.4 | 81 | 1.5 | 0.1 | 2853 | 4049 | 2014 |
# Summary statistics followed by the column types of the water-quality table.
summary(water_quality)
str(water_quality)
STATION.CODE LOCATIONS STATE Temp
Length:1991 Length:1991 Length:1991 Min. :10.00
Class :character Class :character Class :character 1st Qu.:24.75
Mode :character Mode :character Mode :character Median :27.00
Mean :26.21
3rd Qu.:28.40
Max. :35.00
NA's :92
D.O...mg.l. PH CONDUCTIVITY..µmhos.cm. B.O.D...mg.l.
Min. : 0.000 Min. : 0.0 Length:1991 Length:1991
1st Qu.: 5.900 1st Qu.: 6.9 Class :character Class :character
Median : 6.700 Median : 7.3 Mode :character Mode :character
Mean : 6.393 Mean : 112.1
3rd Qu.: 7.200 3rd Qu.: 7.7
Max. :11.400 Max. :67115.0
NA's :31 NA's :8
NITRATENAN.N..NITRITENANN..mg.l. FECAL.COLIFORM..MPN.100ml.
Min. : 0.000 Length:1991
1st Qu.: 0.240 Class :character
Median : 0.516 Mode :character
Mean : 1.623
3rd Qu.: 1.500
Max. :108.700
NA's :225
TOTAL.COLIFORM..MPN.100ml.Mean year
Length:1991 Min. :2003
Class :character 1st Qu.:2008
Mode :character Median :2011
Mean :2010
3rd Qu.:2013
Max. :2014
'data.frame': 1991 obs. of 12 variables: $ STATION.CODE : chr "1393" "1399" "1475" "3181" ... $ LOCATIONS : chr "DAMANGANGA AT D/S OF MADHUBAN, DAMAN" "ZUARI AT D/S OF PT. WHERE KUMBARJRIA CANAL JOINS, GOA" "ZUARI AT PANCHAWADI" "RIVER ZUARI AT BORIM BRIDGE" ... $ STATE : chr "DAMAN & DIU" "GOA" "GOA" "GOA" ... $ Temp : num 30.6 29.8 29.5 29.7 29.5 30 29.2 29.6 30 30.1 ... $ D.O...mg.l. : num 6.7 5.7 6.3 5.8 5.8 5.5 6.1 6.4 6.4 6.3 ... $ PH : num 7.5 7.2 6.9 6.9 7.3 7.4 6.7 6.7 7.6 7.6 ... $ CONDUCTIVITY..µmhos.cm. : chr "203" "189" "179" "64" ... $ B.O.D...mg.l. : chr "NAN" "2" "1.7" "3.8" ... $ NITRATENAN.N..NITRITENANN..mg.l.: num 0.1 0.2 0.1 0.5 0.4 0.1 0.3 0.2 0.1 0.1 ... $ FECAL.COLIFORM..MPN.100ml. : chr "11" "4953" "3243" "5382" ... $ TOTAL.COLIFORM..MPN.100ml.Mean : chr "27" "8391" "5330" "8443" ... $ year : int 2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
# Is anything missing in the water-quality table, and how many cells?
any(is.na(water_quality))
sum(is.na(water_quality))
Omitting NA Values
# Drop every row that has at least one NA, then confirm none are left.
water_quality_clean <- na.omit(water_quality)
any(is.na(water_quality_clean))
sum(is.na(water_quality_clean))
library(dplyr)
library(tidyverse)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
Warning message in system("timedatectl", intern = TRUE):
“running command 'timedatectl' had status 1”
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
✔ ggplot2 3.3.5 ✔ purrr 0.3.4
✔ tibble 3.1.6 ✔ stringr 1.4.0
✔ tidyr 1.1.4 ✔ forcats 0.5.1
✔ readr 2.1.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
# Compare the number of TAMILNADU rows after (a) and before (b) omitting NA
# values, to check for data loss.
a <- filter(water_quality_clean, STATE == "TAMILNADU")
dim(a)
b <- filter(water_quality, STATE == "TAMILNADU")
dim(b)
Changing Character Values to Numeric Datatype
# Convert the measurement columns that were read as text into numbers.
# Entries that cannot be parsed as numbers become NA/NaN in the process.
water_quality_clean$STATION.CODE <-
  as.numeric(water_quality_clean$STATION.CODE)
water_quality_clean$`CONDUCTIVITY..µmhos.cm.` <-
  as.numeric(water_quality_clean$`CONDUCTIVITY..µmhos.cm.`)
water_quality_clean$B.O.D...mg.l. <-
  as.numeric(water_quality_clean$B.O.D...mg.l.)
water_quality_clean$FECAL.COLIFORM..MPN.100ml. <-
  as.numeric(water_quality_clean$FECAL.COLIFORM..MPN.100ml.)
water_quality_clean$TOTAL.COLIFORM..MPN.100ml.Mean <-
  as.numeric(water_quality_clean$TOTAL.COLIFORM..MPN.100ml.Mean)

# Inspect the converted table and re-count the missing values.
str(water_quality_clean)
summary(water_quality_clean)
any(is.na(water_quality_clean))
sum(is.na(water_quality_clean))
'data.frame': 1679 obs. of 12 variables: $ STATION.CODE : num 1393 1399 1475 3181 3182 ... $ LOCATIONS : chr "DAMANGANGA AT D/S OF MADHUBAN, DAMAN" "ZUARI AT D/S OF PT. WHERE KUMBARJRIA CANAL JOINS, GOA" "ZUARI AT PANCHAWADI" "RIVER ZUARI AT BORIM BRIDGE" ... $ STATE : chr "DAMAN & DIU" "GOA" "GOA" "GOA" ... $ Temp : num 30.6 29.8 29.5 29.7 29.5 30 29.2 29.6 30 30.1 ... $ D.O...mg.l. : num 6.7 5.7 6.3 5.8 5.8 5.5 6.1 6.4 6.4 6.3 ... $ PH : num 7.5 7.2 6.9 6.9 7.3 7.4 6.7 6.7 7.6 7.6 ... $ CONDUCTIVITY..µmhos.cm. : num 203 189 179 64 83 81 308 414 305 77 ... $ B.O.D...mg.l. : num NaN 2 1.7 3.8 1.9 1.5 1.4 1 2.2 2.3 ... $ NITRATENAN.N..NITRITENANN..mg.l.: num 0.1 0.2 0.1 0.5 0.4 0.1 0.3 0.2 0.1 0.1 ... $ FECAL.COLIFORM..MPN.100ml. : num 11 4953 3243 5382 3428 ... $ TOTAL.COLIFORM..MPN.100ml.Mean : num 27 8391 5330 8443 5500 ... $ year : int 2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ... - attr(*, "na.action")= 'omit' Named int [1:312] 155 174 187 188 193 246 249 251 255 257 ... ..- attr(*, "names")= chr [1:312] "155" "174" "187" "188" ...
STATION.CODE LOCATIONS STATE Temp
Min. : 17 Length:1679 Length:1679 Min. :10.00
1st Qu.:1443 Class :character Class :character 1st Qu.:25.00
Median :1726 Mode :character Mode :character Median :27.00
Mean :1952 Mean :26.29
3rd Qu.:2423 3rd Qu.:28.40
Max. :3473 Max. :35.00
NA's :55
D.O...mg.l. PH CONDUCTIVITY..µmhos.cm. B.O.D...mg.l.
Min. : 0.000 Min. : 2.600 Min. : 3.7 Min. : 0.10
1st Qu.: 5.900 1st Qu.: 6.900 1st Qu.: 77.0 1st Qu.: 1.10
Median : 6.700 Median : 7.300 Median : 188.0 Median : 1.80
Mean : 6.328 Mean : 65.675 Mean : 1982.3 Mean : 6.08
3rd Qu.: 7.100 3rd Qu.: 7.658 3rd Qu.: 663.8 3rd Qu.: 3.60
Max. :10.000 Max. :28598.000 Max. :65700.0 Max. :534.50
NA's :3 NA's :22
NITRATENAN.N..NITRITENANN..mg.l. FECAL.COLIFORM..MPN.100ml.
Min. : 0.000 Min. : 0
1st Qu.: 0.250 1st Qu.: 28
Median : 0.530 Median : 228
Mean : 1.651 Mean : 382007
3rd Qu.: 1.571 3rd Qu.: 993
Max. :108.700 Max. :272521616
NA's :93
TOTAL.COLIFORM..MPN.100ml.Mean year
Min. : 0 Min. :2003
1st Qu.: 135 1st Qu.:2008
Median : 574 Median :2011
Mean : 613536 Mean :2010
3rd Qu.: 2264 3rd Qu.:2013
Max. :511090873 Max. :2014
NA's :73
# Count the missing cells among the TAMILNADU rows.
a <- water_quality_clean %>%
  filter(STATE == "TAMILNADU")
sum(is.na(a))

# Names of the columns that contain at least one NA. vapply() tests each
# column on its own; apply() would first coerce the whole data frame to a
# character matrix.
list_na <- colnames(water_quality_clean)[
  vapply(water_quality_clean, anyNA, logical(1), USE.NAMES = FALSE)
]
list_na
Replacing the NA values with their column mean and median values
# Mean of every column that contains NAs, ignoring the missing entries.
# Indexing with `[list_na]` always yields a data frame, so this also works
# when only one column has NAs (the `df[, cols]` form would drop to a plain
# vector in that case and apply() would fail).
water_quality_mean <- vapply(
  water_quality_clean[list_na],
  mean,
  numeric(1),
  na.rm = TRUE
)
water_quality_mean

# Mean and median of each converted column, one table per column.
water_quality_clean %>%
  summarize(
    avg = mean(STATION.CODE, na.rm = TRUE),
    med = median(STATION.CODE, na.rm = TRUE)
  )
# The conductivity column name contains a non-ASCII character, so it is
# backticked to keep the reference independent of the session locale.
water_quality_clean %>%
  summarize(
    avg = mean(`CONDUCTIVITY..µmhos.cm.`, na.rm = TRUE),
    med = median(`CONDUCTIVITY..µmhos.cm.`, na.rm = TRUE)
  )
water_quality_clean %>%
  summarize(
    avg = mean(B.O.D...mg.l., na.rm = TRUE),
    med = median(B.O.D...mg.l., na.rm = TRUE)
  )
water_quality_clean %>%
  summarize(
    avg = mean(FECAL.COLIFORM..MPN.100ml., na.rm = TRUE),
    med = median(FECAL.COLIFORM..MPN.100ml., na.rm = TRUE)
  )
water_quality_clean %>%
  summarize(
    avg = mean(TOTAL.COLIFORM..MPN.100ml.Mean, na.rm = TRUE),
    med = median(TOTAL.COLIFORM..MPN.100ml.Mean, na.rm = TRUE)
  )
| avg | med |
|---|---|
| <dbl> | <dbl> |
| 1952.099 | 1726 |
| avg | med |
|---|---|
| <dbl> | <dbl> |
| 1982.327 | 188 |
| avg | med |
|---|---|
| <dbl> | <dbl> |
| 6.080246 | 1.8 |
| avg | med |
|---|---|
| <dbl> | <dbl> |
| 382007.1 | 227.5 |
| avg | med |
|---|---|
| <dbl> | <dbl> |
| 613535.8 | 574.5 |
# Preview the first rows of the converted water-quality table.
head(water_quality_clean)
| STATION.CODE | LOCATIONS | STATE | Temp | D.O...mg.l. | PH | CONDUCTIVITY..µmhos.cm. | B.O.D...mg.l. | NITRATENAN.N..NITRITENANN..mg.l. | FECAL.COLIFORM..MPN.100ml. | TOTAL.COLIFORM..MPN.100ml.Mean | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| <dbl> | <chr> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <int> | |
| 1 | 1393 | DAMANGANGA AT D/S OF MADHUBAN, DAMAN | DAMAN & DIU | 30.6 | 6.7 | 7.5 | 203 | NaN | 0.1 | 11 | 27 | 2014 |
| 2 | 1399 | ZUARI AT D/S OF PT. WHERE KUMBARJRIA CANAL JOINS, GOA | GOA | 29.8 | 5.7 | 7.2 | 189 | 2.0 | 0.2 | 4953 | 8391 | 2014 |
| 3 | 1475 | ZUARI AT PANCHAWADI | GOA | 29.5 | 6.3 | 6.9 | 179 | 1.7 | 0.1 | 3243 | 5330 | 2014 |
| 4 | 3181 | RIVER ZUARI AT BORIM BRIDGE | GOA | 29.7 | 5.8 | 6.9 | 64 | 3.8 | 0.5 | 5382 | 8443 | 2014 |
| 5 | 3182 | RIVER ZUARI AT MARCAIM JETTY | GOA | 29.5 | 5.8 | 7.3 | 83 | 1.9 | 0.4 | 3428 | 5500 | 2014 |
| 6 | 1400 | MANDOVI AT NEGHBOURHOOD OF PANAJI, GOA | GOA | 30.0 | 5.5 | 7.4 | 81 | 1.5 | 0.1 | 2853 | 4049 | 2014 |
Cleaning of outlier data for reservoir level
# Compact column-by-column view of the reservoir-level table.
glimpse(water_level)
Rows: 6,182 Columns: 5 $ Date <dttm> 2004-01-01, 2004-01-02, 2004-01-03, 2004-01-04, 2004-… $ POONDI <dbl> 3.9, 3.9, 3.9, 3.9, 3.8, 3.8, 3.8, 3.7, 3.7, 3.7, 3.6,… $ CHOLAVARAM <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, … $ REDHILLS <dbl> 268, 268, 267, 267, 267, 266, 266, 265, 264, 264, 263,… $ CHEMBARAMBAKKAM <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Compact column-by-column view of the rainfall table.
glimpse(rainfall)
Rows: 6,182 Columns: 5 $ Date <dttm> 2004-01-01, 2004-01-02, 2004-01-03, 2004-01-04, 2004-… $ POONDI <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, … $ CHOLAVARAM <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, … $ REDHILLS <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, … $ CHEMBARAMBAKKAM <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Compact column-by-column view of the water-quality table.
glimpse(water_quality_clean)
Rows: 1,679 Columns: 12 $ STATION.CODE <dbl> 1393, 1399, 1475, 3181, 3182, 1400, 1… $ LOCATIONS <chr> "DAMANGANGA AT D/S OF MADHUBAN, DAMAN… $ STATE <chr> "DAMAN & DIU", "GOA", "GOA", "GOA", "… $ Temp <dbl> 30.6, 29.8, 29.5, 29.7, 29.5, 30.0, 2… $ D.O...mg.l. <dbl> 6.7, 5.7, 6.3, 5.8, 5.8, 5.5, 6.1, 6.… $ PH <dbl> 7.5, 7.2, 6.9, 6.9, 7.3, 7.4, 6.7, 6.… $ CONDUCTIVITY..µmhos.cm. <dbl> 203, 189, 179, 64, 83, 81, 308, 414, … $ B.O.D...mg.l. <dbl> NaN, 2.0, 1.7, 3.8, 1.9, 1.5, 1.4, 1.… $ NITRATENAN.N..NITRITENANN..mg.l. <dbl> 0.1, 0.2, 0.1, 0.5, 0.4, 0.1, 0.3, 0.… $ FECAL.COLIFORM..MPN.100ml. <dbl> 11, 4953, 3243, 5382, 3428, 2853, 335… $ TOTAL.COLIFORM..MPN.100ml.Mean <dbl> 27, 8391, 5330, 8443, 5500, 4049, 567… $ year <int> 2014, 2014, 2014, 2014, 2014, 2014, 2…
# Rows on which all four reservoirs report a level above zero.
# NOTE(review): `level` is not used by any later step visible here; the
# separate() call below works on the unfiltered water_level -- confirm intent.
level <- water_level %>%
  filter(POONDI > 0, CHOLAVARAM > 0, REDHILLS > 0, CHEMBARAMBAKKAM > 0)
# Split the Date column on "-" into Year, Month and Day columns.
water_level <- separate(
  water_level,
  Date,
  c("Year", "Month", "Day"),
  sep = "-"
)
names(water_level)
# Keep only the rows on which all four reservoirs recorded rainfall above zero.
rainfall <- rainfall %>%
  filter(POONDI > 0, CHOLAVARAM > 0, REDHILLS > 0, CHEMBARAMBAKKAM > 0)
# Split the Date column on "-" into Year, Month and Day columns.
rainfall_level <- separate(
  rainfall,
  Date,
  c("Year", "Month", "Day"),
  sep = "-"
)
names(rainfall_level)
# Keep only the rows whose converted measurement columns are all present.
# Missingness has to be tested with is.na(): `x != NA` evaluates to NA for
# every row, and filter() drops rows whose condition is NA, which would leave
# an empty table.
water_quality_level <- water_quality_clean %>%
  filter(
    !is.na(STATION.CODE),
    !is.na(`CONDUCTIVITY..µmhos.cm.`),
    !is.na(B.O.D...mg.l.),
    !is.na(FECAL.COLIFORM..MPN.100ml.),
    !is.na(TOTAL.COLIFORM..MPN.100ml.Mean)
  )
names(water_quality_level)
glimpse(water_quality_clean)
Rows: 1,679 Columns: 12 $ STATION.CODE <dbl> 1393, 1399, 1475, 3181, 3182, 1400, 1… $ LOCATIONS <chr> "DAMANGANGA AT D/S OF MADHUBAN, DAMAN… $ STATE <chr> "DAMAN & DIU", "GOA", "GOA", "GOA", "… $ Temp <dbl> 30.6, 29.8, 29.5, 29.7, 29.5, 30.0, 2… $ D.O...mg.l. <dbl> 6.7, 5.7, 6.3, 5.8, 5.8, 5.5, 6.1, 6.… $ PH <dbl> 7.5, 7.2, 6.9, 6.9, 7.3, 7.4, 6.7, 6.… $ CONDUCTIVITY..µmhos.cm. <dbl> 203, 189, 179, 64, 83, 81, 308, 414, … $ B.O.D...mg.l. <dbl> NaN, 2.0, 1.7, 3.8, 1.9, 1.5, 1.4, 1.… $ NITRATENAN.N..NITRITENANN..mg.l. <dbl> 0.1, 0.2, 0.1, 0.5, 0.4, 0.1, 0.3, 0.… $ FECAL.COLIFORM..MPN.100ml. <dbl> 11, 4953, 3243, 5382, 3428, 2853, 335… $ TOTAL.COLIFORM..MPN.100ml.Mean <dbl> 27, 8391, 5330, 8443, 5500, 4049, 567… $ year <int> 2014, 2014, 2014, 2014, 2014, 2014, 2…
For water level and rain level histogram plots:
x-axis: water/rain level
y-axis: count(frequency)
Note: Inferences for the plots are given below them wherever necessary.
# Base-graphics histogram and boxplot of the POONDI levels.
hist(water_level$POONDI)
boxplot(water_level$POONDI, main = "Boxplot on POONDI Data")
library(ggplot2)
# Set the repr plot size options (width 15, height 10) for the figures below.
options(repr.plot.width = 15, repr.plot.height = 10)
# POONDI level per year as boxplots.
ggplot(water_level, aes(x = Year, y = POONDI)) +
  geom_boxplot() +
  labs(
    title = "Water level in POONDI",
    x = "Year",
    y = "Water Level in POONDI"
  )
# Day-by-year tiles coloured by the POONDI water level.
ggplot(water_level, aes(x = Day, y = Year, fill = POONDI)) +
  geom_tile() +
  labs(
    title = "Water Level in POONDI",
    x = "Water Level per day",
    y = "Water Level in POONDI per Year"
  )
# Day-by-year tiles coloured by the POONDI rainfall.
ggplot(rainfall_level, aes(x = Day, y = Year, fill = POONDI)) +
  geom_tile() +
  labs(
    title = "Rainfall Level in POONDI",
    x = "Rain Level per day",
    y = "Rain Level in POONDI per Year"
  )
# Month-by-year tiles coloured by the POONDI water level.
ggplot(water_level, aes(x = Month, y = Year, fill = POONDI)) +
  geom_tile() +
  labs(
    title = "Water Level in POONDI",
    x = "Water Level in POONDI per month",
    y = "Year"
  )
We can observe that even with less rainfall for the year 2020, it has got somewhat better water level than 2019. This could be the result of action taken by the concerned authorities to replenish the water level with other resources available. Also some amount of rainfall in month of JAN & APRIL accounts to it.
# Month-by-year tiles coloured by the POONDI rainfall.
ggplot(rainfall_level, aes(x = Month, y = Year, fill = POONDI)) +
  geom_tile() +
  labs(
    title = "Rainfall Level in POONDI",
    x = "Rain Level in POONDI per month",
    y = "Year"
  )
Rainfall for the region has been from the mid year till the end. Amount of rainfall is not good but the rainfall level each month accounts to the water level maintenance. It also shows the decrease in rainfall level in recent years.
# Density of the POONDI water level per year on a log10 x axis. The data is
# water_level, so the title and x label name the water level (not rain).
ggplot(water_level, aes(x = POONDI, fill = Year)) +
  geom_density(alpha = 0.1) +
  scale_x_log10() +
  labs(
    title = "Density plot for Water Level in POONDI",
    x = "Water Level in POONDI",
    y = "Density Frequency"
  )
# Same density, split by month.
ggplot(water_level, aes(x = POONDI, fill = Month)) +
  geom_density(alpha = 0.1) +
  scale_x_log10()
# POONDI water level per month as boxplots; the y axis shows the water level.
ggplot(water_level, aes(x = Month, y = POONDI)) +
  geom_boxplot() +
  labs(
    title = "water Level in POONDI",
    x = "Month",
    y = "Water Level in POONDI"
  )
We can observe the outliers present in the water level data for different months in POONDI region. August and september have got outliers in data. Also we can observe the highest water level in the month of December.
# Histogram of the POONDI water level, stacked by year.
ggplot(water_level, aes(x = POONDI, fill = Year)) +
  geom_histogram() +
  labs(title = "Water level POONDI")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
With the increasing years, water level for POONDI reservoir has been decreasing. This shows that the proper replenishing of the water level is not met.
# Histogram of the POONDI rainfall, stacked by year.
ggplot(rainfall_level, aes(x = POONDI, fill = Year)) +
  geom_histogram() +
  labs(title = "Rain level in POONDI")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Good rainfall has never been the case for Chennai, but in recent times the frequency of rainfall has also decreased. From the graph it is clear that rainfall used to be more frequent in earlier years, even though the rainfall level was low.
# Water-level rows for the year 2020 (Year is compared as text).
y2020 <- filter(water_level, Year == "2020")
ggplot(y2020, aes(x = POONDI, fill = Month)) +
  geom_histogram() +
  labs(title = "Water level POONDI 2020")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Observing the water level trend for the last year, we can infer that even during the monsoon seasons (July-Sept) the water level is not good. For only a few instances, it is close to 3000 during the month of December.
# Rainfall rows for the year 2020 (Year is compared as text).
y2020 <- filter(rainfall_level, Year == "2020")
ggplot(y2020, aes(x = POONDI, fill = Month)) +
  geom_histogram() +
  labs(title = "Rainfall level POONDI 2020")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The rainfall for the monsoon season last year has been bad. The region has been receiving very less rainfall. There has been some amount of rainfall for Nov, Dec which has replenished the water level to some extent.
# CHOLAVARAM level per year as boxplots.
ggplot(water_level, aes(x = Year, y = CHOLAVARAM)) +
  geom_boxplot() +
  labs(
    title = "Water level in CHOLAVARAM",
    x = "Year",
    y = "Water Level in CHOLAVARAM"
  )
# Day-by-year tiles coloured by the CHOLAVARAM water level.
ggplot(water_level, aes(x = Day, y = Year, fill = CHOLAVARAM)) +
  geom_tile() +
  labs(
    title = "Water Level in CHOLAVARAM",
    x = "Water Level per day",
    y = "Water Level in CHOLAVARAM per Year"
  )
# Day-by-year tiles coloured by the CHOLAVARAM rainfall.
ggplot(rainfall_level, aes(x = Day, y = Year, fill = CHOLAVARAM)) +
  geom_tile() +
  labs(
    title = "Rainfall Level in CHOLAVARAM",
    x = "Rain Level per day",
    y = "Rain Level in CHOLAVARAM per Year"
  )
# Month-by-year tiles coloured by the CHOLAVARAM water level.
ggplot(water_level, aes(x = Month, y = Year, fill = CHOLAVARAM)) +
  geom_tile() +
  labs(
    title = "Water Level in CHOLAVARAM",
    x = "Water Level in CHOLAVARAM per month",
    y = "Year"
  )
We can observe the same here: a serious decline in water and rainfall levels over the years. The rainfall for this region, too, falls from mid-year until the end of the year.
# Month-by-year tiles coloured by the CHOLAVARAM rainfall.
ggplot(rainfall_level, aes(x = Month, y = Year, fill = CHOLAVARAM)) +
  geom_tile() +
  labs(
    title = "Rainfall Level in CHOLAVARAM",
    x = "Rain Level in CHOLAVARAM per month",
    y = "Year"
  )
# Density of the CHOLAVARAM water level per year on a log10 x axis. The data
# is water_level, so the title and x label name the water level (not rain).
ggplot(water_level, aes(x = CHOLAVARAM, fill = Year)) +
  geom_density(alpha = 0.1) +
  scale_x_log10() +
  labs(
    title = "Density plot for Water Level in CHOLAVARAM",
    x = "Water Level in CHOLAVARAM",
    y = "Density Frequency"
  )
Warning message: “Transformation introduced infinite values in continuous x-axis” Warning message: “Removed 1207 rows containing non-finite values (stat_density).”
# Density of the CHOLAVARAM water level per month on a log10 x axis.
ggplot(water_level, aes(x = CHOLAVARAM, fill = Month)) +
  geom_density(alpha = 0.1) +
  scale_x_log10()
Warning message: “Transformation introduced infinite values in continuous x-axis” Warning message: “Removed 1207 rows containing non-finite values (stat_density).”
# Histogram of the CHOLAVARAM water level, stacked by year.
ggplot(water_level, aes(x = CHOLAVARAM, fill = Year)) +
  geom_histogram() +
  labs(title = "Water level CHOLAVARAM")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We can observe the decline in water level over the years. In the recent years the region has got the least water level. This region's condition is even worse than the POONDI region.
# Histogram of the CHOLAVARAM rainfall, stacked by year.
ggplot(rainfall_level, aes(x = CHOLAVARAM, fill = Year)) +
  geom_histogram() +
  labs(title = "Rain level in CHOLAVARAM")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We can observe the decrease in rainfall level over the years, but the decrease in frequency of rainfall is more of a concern. Even Small amounts of rainfall can account to maintenance of water level but this region is hit even harder in terms of rainfall.
# Water-level rows for 2020, shown as a CHOLAVARAM histogram stacked by month.
y12020 <- filter(water_level, Year == "2020")
ggplot(y12020, aes(x = CHOLAVARAM, fill = Month)) +
  geom_histogram() +
  labs(title = "Water level CHOLAVARAM 2020")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We can observe the results of water level for this drastically hit region here. It is least for the most part of the year, some rainfall in last 3 months have brought slight increase in the water level.
# Rainfall rows for 2020, shown as a CHOLAVARAM histogram stacked by month.
y12020 <- filter(rainfall_level, Year == "2020")
ggplot(y12020, aes(x = CHOLAVARAM, fill = Month)) +
  geom_histogram() +
  labs(title = "Rainfall level CHOLAVARAM 2020")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Most of the rainfall for the previous year fell in the month of November. We can also see more frequent but small amounts of rainfall in July. Low or no rainfall in the other months explains the water level of the region.
# REDHILLS level per year as boxplots.
ggplot(water_level, aes(x = Year, y = REDHILLS)) +
  geom_boxplot() +
  labs(
    title = "Water level in REDHILLS",
    x = "Year",
    y = "Water Level in REDHILLS"
  )
# Day-by-year tiles coloured by the REDHILLS water level.
ggplot(water_level, aes(x = Day, y = Year, fill = REDHILLS)) +
  geom_tile() +
  labs(
    title = "Water Level in REDHILLS",
    x = "Water Level per day",
    y = "Water Level in REDHILLS per Year"
  )
# Day-by-year tiles coloured by the REDHILLS rainfall.
ggplot(rainfall_level, aes(x = Day, y = Year, fill = REDHILLS)) +
  geom_tile() +
  labs(
    title = "Rainfall Level in REDHILLS",
    x = "Rain Level per day",
    y = "Rain Level in REDHILLS per Year"
  )
# Month-by-year tiles coloured by the REDHILLS water level.
ggplot(water_level, aes(x = Month, y = Year, fill = REDHILLS)) +
  geom_tile() +
  labs(
    title = "Water Level in REDHILLS",
    x = "Water Level in REDHILLS per month",
    y = "Year"
  )
In the REDHILLS Region the amount of water level in the reservoir has been almost constant. But in the month of April the required amount of water level is increased and the month of September to October it needs less amount of water due to heavy amount of rainfall.
# Month-by-year tiles coloured by the REDHILLS rainfall.
ggplot(rainfall_level, aes(x = Month, y = Year, fill = REDHILLS)) +
  geom_tile() +
  labs(
    title = "Rainfall Level in REDHILLS",
    x = "Rain Level in REDHILLS per month",
    y = "Year"
  )
From the above result we can infer that 2020 resulted in less amount of rainfall and therefore to accommodate the water crisis, the water level in the reservoir is increased to a comparable amount and therefore the requirement of water at a certain level is fulfilled.
# Density of the REDHILLS water level per year on a log10 x axis. The data is
# water_level, so the title and x label name the water level (not rain).
ggplot(water_level, aes(x = REDHILLS, fill = Year)) +
  geom_density(alpha = 0.1) +
  scale_x_log10() +
  labs(
    title = "Density plot for Water Level in REDHILLS",
    x = "Water Level in REDHILLS",
    y = "Density Frequency"
  )
Warning message: “Transformation introduced infinite values in continuous x-axis” Warning message: “Removed 206 rows containing non-finite values (stat_density).”
# Density of the REDHILLS water level per month on a log10 x axis.
ggplot(water_level, aes(x = REDHILLS, fill = Month)) +
  geom_density(alpha = 0.1) +
  scale_x_log10()
Warning message: “Transformation introduced infinite values in continuous x-axis” Warning message: “Removed 206 rows containing non-finite values (stat_density).”
# Histogram of the REDHILLS water level, stacked by year.
ggplot(water_level, aes(x = REDHILLS, fill = Year)) +
  geom_histogram() +
  labs(title = "Water level REDHILLS")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The year 2012 marked the decline in the water levels of Redhill reservoir and since then, it has been decreasing. However, we can see a significant change in the year 2020.
# Histogram of the REDHILLS rainfall, stacked by year.
ggplot(rainfall_level, aes(x = REDHILLS, fill = Year)) +
  geom_histogram() +
  labs(title = "Rain level in REDHILLS")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The rain levels have significantly reduced since 2015 in the Redhills area. We can see how unevenly the rainfall is distributed over the different months. Except for the last 2 months, the rain levels have dropped.
# Water-level rows for 2020, shown as a REDHILLS histogram stacked by month.
y22020 <- filter(water_level, Year == "2020")
ggplot(y22020, aes(x = REDHILLS, fill = Month)) +
  geom_histogram() +
  labs(title = "Water level REDHILLS 2020")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
From the above results we can infer that the month of April shows the highest amount of water level and October shows the least range of water level requirement in the year 2020. And therefore, the main reason for water crisis can be poor replenishment of water level in the reservoir.
# Rainfall rows for 2020, shown as a REDHILLS histogram stacked by month.
y22020 <- filter(rainfall_level, Year == "2020")
ggplot(y22020, aes(x = REDHILLS, fill = Month)) +
  geom_histogram() +
  labs(title = "Rainfall level REDHILLS 2020")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
From the above graph it is clearly stated that rainfall level is highest in the month of October, and December shows the lowest amount of rainfall in the year 2020.
# CHEMBARAMBAKKAM level per year as boxplots.
ggplot(water_level, aes(x = Year, y = CHEMBARAMBAKKAM)) +
  geom_boxplot() +
  labs(
    title = "Water level in CHEMBARAMBAKKAM",
    x = "Year",
    y = "Water Level in CHEMBARAMBAKKAM"
  )
# Day-by-year tiles coloured by the CHEMBARAMBAKKAM water level.
ggplot(water_level, aes(x = Day, y = Year, fill = CHEMBARAMBAKKAM)) +
  geom_tile() +
  labs(
    title = "Water Level in CHEMBARAMBAKKAM",
    x = "Water Level per day",
    y = "Water Level in CHEMBARAMBAKKAM per Year"
  )
# Day-by-year tiles coloured by the CHEMBARAMBAKKAM rainfall. The y label
# has a space between "in" and the reservoir name, like the sibling plots.
ggplot(rainfall_level, aes(x = Day, y = Year, fill = CHEMBARAMBAKKAM)) +
  geom_tile() +
  labs(
    title = "Rainfall Level in CHEMBARAMBAKKAM",
    x = "Rain Level per day",
    y = "Rain Level in CHEMBARAMBAKKAM per Year"
  )
# Month-by-year tiles coloured by the CHEMBARAMBAKKAM water level.
ggplot(water_level, aes(x = Month, y = Year, fill = CHEMBARAMBAKKAM)) +
  geom_tile() +
  labs(
    title = "Water Level in CHEMBARAMBAKKAM",
    x = "Water Level in CHEMBARAMBAKKAM per month",
    y = "Year"
  )
From the above graph, the Chembarambakkam area shows its highest water level in the year 2016, in the months of October to November, which results in proper replenishment of the reservoir.
# Month-by-year tiles coloured by the CHEMBARAMBAKKAM rainfall.
ggplot(rainfall_level, aes(x = Month, y = Year, fill = CHEMBARAMBAKKAM)) +
  geom_tile() +
  labs(
    title = "Rainfall Level in CHEMBARAMBAKKAM",
    x = "Rain Level in CHEMBARAMBAKKAM per month",
    y = "Year"
  )
From the above result the area of Chembarambakkam faces a great amount of downfall in rainfall level after the year of 2016 and therefore is not a good sign of proper amount of water availability.
# Density of the CHEMBARAMBAKKAM water level per year on a log10 x axis. The
# data is water_level, so the title and x label name the water level (not rain).
ggplot(water_level, aes(x = CHEMBARAMBAKKAM, fill = Year)) +
  geom_density(alpha = 0.1) +
  scale_x_log10() +
  labs(
    title = "Density plot for Water Level in CHEMBARAMBAKKAM",
    x = "Water Level in CHEMBARAMBAKKAM",
    y = "Density Frequency"
  )
Warning message: “Transformation introduced infinite values in continuous x-axis” Warning message: “Removed 424 rows containing non-finite values (stat_density).”
# Density of the CHEMBARAMBAKKAM water level per month on a log10 x axis.
ggplot(water_level, aes(x = CHEMBARAMBAKKAM, fill = Month)) +
  geom_density(alpha = 0.1) +
  scale_x_log10()
Warning message: “Transformation introduced infinite values in continuous x-axis” Warning message: “Removed 424 rows containing non-finite values (stat_density).”
# Histogram of the CHEMBARAMBAKKAM water level, stacked by year.
ggplot(water_level, aes(x = CHEMBARAMBAKKAM, fill = Year)) +
  geom_histogram() +
  labs(title = "Water level in CHEMBARAMBAKKAM")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We found that Chembarambakkam was worst hit during 2019, with the water levels dropping below 5000. However, it recovered from the loss in 2020.
# Histogram of the CHEMBARAMBAKKAM rainfall, stacked by year.
ggplot(rainfall_level, aes(x = CHEMBARAMBAKKAM, fill = Year)) +
  geom_histogram() +
  labs(title = "Rain level in CHEMBARAMBAKKAM")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Just like all other reservoirs, Chembarambakkam received its highest rainfall during November with significantly lower rain in other months of 2020.
# Water-level rows for 2020, shown as a CHEMBARAMBAKKAM histogram by month.
y32020 <- filter(water_level, Year == "2020")
ggplot(y32020, aes(x = CHEMBARAMBAKKAM, fill = Month)) +
  geom_histogram() +
  labs(title = "Water level CHEMBARAMBAKKAM 2020")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
From the above result of water level in reservoir in the year 2020, month of November has the highest amount of water level in the reservoir and June gives the least amount of water level reservoir level in the year 2020.
# Rainfall rows for 2020, shown as a CHEMBARAMBAKKAM histogram by month.
y32020 <- filter(rainfall_level, Year == "2020")
ggplot(y32020, aes(x = CHEMBARAMBAKKAM, fill = Month)) +
  geom_histogram() +
  labs(title = "Rainfall level CHEMBARAMBAKKAM 2020")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The month of November in the year 2020 shows the highest amount of rainfall and therefore there is less amount of chances of facing any water crisis during that time since the level of water in reservoir and the rainfall level are equally related.
# FOR RAINFALL
# POONDI rainfall per year as columns, stacked by month.
ggplot(rainfall_level, aes(x = Year, y = POONDI, fill = Month)) +
  geom_col() +
  ggtitle("Rain Level in POONDI") +
  xlab("Year") +
  ylab("Rain level in POONDI")
Overall rainfall has decreased in this region in comparison to the previous year. Most rainfall occurred during the monsoon season in 2019, while in 2020 it occurred during November.
# CHOLAVARAM rainfall per year as columns, stacked by month.
ggplot(rainfall_level, aes(x = Year, y = CHOLAVARAM, fill = Month)) +
  geom_col() +
  ggtitle("Rain Level in CHOLAVARAM") +
  xlab("Year") +
  ylab("Rain level in CHOLAVARAM")
Overall rainfall has decreased in this region in comparison to the previous year. Most rainfall occurred during October in 2019, while in 2020 it occurred during November.
# REDHILLS rainfall per year as columns, stacked by month.
ggplot(rainfall_level, aes(x = Year, y = REDHILLS, fill = Month)) +
  geom_col() +
  ggtitle("Rain Level in REDHILLS") +
  xlab("Year") +
  ylab("Rain level in Redhills")
Overall rainfall has increased in this region in comparison to the previous year. Most rainfall occurred during October in 2019, while in 2020 it occurred during November, with a decent amount in October as well, which resulted in an increase in the overall rainfall level.
# CHEMBARAMBAKKAM rainfall per year as columns, stacked by month.
ggplot(rainfall_level, aes(x = Year, y = CHEMBARAMBAKKAM, fill = Month)) +
  geom_col() +
  ggtitle("Rain Level in CHEMBARAMBAKKAM") +
  xlab("Year") +
  ylab("Rain level in CHEMBARAMBAKKAM")
Overall rainfall has decreased in this region in comparison to the previous year. Most rainfall occurred during October in 2019, while in 2020 it occurred during November.
# FOR WATER LEVEL
# POONDI water level per year as columns, stacked by month.
ggplot(water_level, aes(x = Year, y = POONDI, fill = Month)) +
  geom_col() +
  ggtitle("Water Level in POONDI") +
  xlab("Year") +
  ylab("Water level in POONDI")
Water level in this region has increased from the previous years. And the water level is constant for most of the months of the year 2020.
# CHOLAVARAM water level per year as columns, stacked by month.
ggplot(water_level, aes(x = Year, y = CHOLAVARAM, fill = Month)) +
  geom_col() +
  ggtitle("Water Level in CHOLAVARAM") +
  xlab("Year") +
  ylab("Water level in CHOLAVARAM")
Water level in this region has increased from the previous year but it is very low throughout the year except for the month of November in 2020.
# REDHILLS water level per year as columns, stacked by month.
ggplot(water_level, aes(x = Year, y = REDHILLS, fill = Month)) +
  geom_col() +
  ggtitle("water Level in REDHILLS") +
  xlab("Year") +
  ylab("Water level in Redhills")
Water level in this region has increased drastically from the previous year. This can be justified by the increase in the rainfall level in this region. Also its constant throughout the year which shows balanced usage and replenishment.
# CHEMBARAMBAKKAM water level per year as columns, stacked by month.
ggplot(water_level, aes(x = Year, y = CHEMBARAMBAKKAM, fill = Month)) +
  geom_col() +
  ggtitle("Water Level in CHEMBARAMBAKKAM") +
  xlab("Year") +
  ylab("Water level in CHEMBARAMBAKKAM")
Water level in this region has increased drastically from the previous year. And the water level is constant throughout the year 2020.
# Setting model for elbow plot
# Feature subset used for clustering: columns 3 to 7 of water_level.
water_level1 <- water_level[3:7]
library(cluster)
set.seed(5) # fixed seed so the random k-means initialisation is repeatable

# Sum of within-cluster sums of squares for k = 1..10.
# The k-means runs use the prepared subset water_level1 (the same data that is
# later passed to Mclust) instead of the full water_level data frame.
# NOTE(review): assumes columns 3:7 are the intended clustering features --
# confirm against the column layout of water_level.
vec <- numeric(10)
for (i in seq_len(10)) {
  vec[i] <- sum(kmeans(water_level1, i)$withinss)
}
vec
plot(1:10, vec, type = "b", main = "Elbow Plotting", xlab = "Number of clusters", ylab = "Vector")
To determine the optimal number of clusters, select the value of k at the elbow, i.e. the point after which the distortion (the within-cluster sum of squares) starts decreasing in a linear fashion.
# Unsupervised Learning
# k-means with 2 centers, run on the prepared feature subset water_level1
# (columns 3:7 of water_level, built earlier in this file) rather than on the
# full water_level data frame.
# NOTE(review): assumes water_level1 is the intended feature set -- confirm.
kmeans1 <- kmeans(x = water_level1, centers = 2)
ykmeans <- kmeans1$cluster

# Plot the clustered data with the cluster assignment; TRUE is spelled out
# because T is an ordinary variable that can be reassigned.
clusplot(
  water_level1,
  ykmeans,
  lines = 0,
  color = TRUE,
  shade = TRUE,
  main = "Clustering Model of Reservoir Level",
  xlab = "Year",
  ylab = "Water Reservoir Level"
)
After determining the number of clusters, k-means clustering is applied to the dataset to look for groups that have not been explicitly labeled in the data. This cluster analysis finds subgroups of the samples based on the features of the dataset, and the algorithm solves the problem using an expectation-maximization approach. Therefore, based on the result above, we take the number of clusters to be 2 to display the subgroups of the water level dataset.
# Install mclust only when it is not already available, so that re-running the
# script does not reinstall the package every time, then attach it.
if (!requireNamespace("mclust", quietly = TRUE)) {
  install.packages("mclust")
}
library(mclust)
Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)
Package 'mclust' version 5.4.8
Type 'citation("mclust")' for citing this R package in publications.
Attaching package: ‘mclust’
The following object is masked from ‘package:purrr’:
map
# Fit a Gaussian mixture model (mclust) to the water_level1 subset and draw
# the default plots for the fitted object.
model_fit <- Mclust(data = water_level1)
plot(x = model_fit)
Fitting the Mclust model gives a clustering of all the regions with respect to Day, which is used to judge which region gets the highest amount of rainfall and which region has the highest reservoir water level. From the above results, the region that gets the highest amount of rainfall is REDHILLS, and the region with the highest reservoir water level is POONDI.
# Summary of the model
# Print the summary of the fitted Mclust object.
print(summary(model_fit))
----------------------------------------------------
Gaussian finite mixture model fitted by EM algorithm
----------------------------------------------------
Mclust VEV (ellipsoidal, equal shape) model with 9 components:
log-likelihood n df BIC ICL
-194689.8 6182 156 -390741.3 -392369.2
Clustering table:
1 2 3 4 5 6 7 8 9
594 758 459 1470 1027 256 410 548 660
# Simple linear regressions of Month on each reservoir column of water_level.
# Each fitted model is printed right after it is created.
# NOTE(review): Month is the response and the reservoir level the predictor;
# confirm this direction is intended.
model1 <- lm(formula = Month ~ POONDI, data = water_level)
print(model1)

model2 <- lm(formula = Month ~ CHOLAVARAM, data = water_level)
print(model2)

model3 <- lm(formula = Month ~ CHEMBARAMBAKKAM, data = water_level)
print(model3)

model4 <- lm(formula = Month ~ REDHILLS, data = water_level)
print(model4)
Call: lm(formula = Month ~ POONDI, data = water_level) Coefficients: (Intercept) POONDI 6.9617282 -0.0004194
Call: lm(formula = Month ~ CHOLAVARAM, data = water_level) Coefficients: (Intercept) CHOLAVARAM 6.6831897 -0.0008198
Call:
lm(formula = Month ~ CHEMBARAMBAKKAM, data = water_level)
Coefficients:
(Intercept) CHEMBARAMBAKKAM
6.9863050 -0.0003698
Call: lm(formula = Month ~ REDHILLS, data = water_level) Coefficients: (Intercept) REDHILLS 7.7118843 -0.0007626
# Summary of all 4 models
# Print the summary of each fitted model, in order model1 .. model4.
invisible(lapply(
  list(model1, model2, model3, model4),
  function(fit) print(summary(fit))
))
Call:
lm(formula = Month ~ POONDI, data = water_level)
Residuals:
Min 1Q Median 3Q Max
-5.9604 -2.8695 0.0403 3.0460 6.3933
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.9617282 0.0645605 107.833 <2e-16 ***
POONDI -0.0004194 0.0000432 -9.709 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.411 on 6180 degrees of freedom
Multiple R-squared: 0.01502, Adjusted R-squared: 0.01486
F-statistic: 94.27 on 1 and 6180 DF, p-value: < 2.2e-16
Call:
lm(formula = Month ~ CHOLAVARAM, data = water_level)
Residuals:
Min 1Q Median 3Q Max
-5.6832 -2.6832 0.3168 2.6226 6.0514
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.6831897 0.0572161 116.806 < 2e-16 ***
CHOLAVARAM -0.0008198 0.0001635 -5.015 5.46e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.43 on 6180 degrees of freedom
Multiple R-squared: 0.004053, Adjusted R-squared: 0.003891
F-statistic: 25.15 on 1 and 6180 DF, p-value: 5.462e-07
Call:
lm(formula = Month ~ CHEMBARAMBAKKAM, data = water_level)
Residuals:
Min 1Q Median 3Q Max
-5.9863 -2.9820 0.0137 3.0137 6.2697
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.9863050 0.0744147 93.883 < 2e-16 ***
CHEMBARAMBAKKAM -0.0003698 0.0000457 -8.094 6.91e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.419 on 6180 degrees of freedom
Multiple R-squared: 0.01049, Adjusted R-squared: 0.01033
F-statistic: 65.51 on 1 and 6180 DF, p-value: 6.914e-16
Call:
lm(formula = Month ~ REDHILLS, data = water_level)
Residuals:
Min 1Q Median 3Q Max
-6.5418 -2.6840 -0.2449 2.5586 6.7757
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.712e+00 8.591e-02 89.77 <2e-16 ***
REDHILLS -7.626e-04 4.678e-05 -16.30 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.366 on 6180 degrees of freedom
Multiple R-squared: 0.04124, Adjusted R-squared: 0.04108
F-statistic: 265.8 on 1 and 6180 DF, p-value: < 2.2e-16